Read in datasets to be used:
Boundary data: London wards (source: London Data Store)
Attribute data: London wards atlas
Schools point data: all London schools with XY coordinates, 2016
The file is downloaded directly from the Data Store url; the fs package is used to locate the files and utils::unzip() to extract them from the zip archive.
#If first time running script:
# Helper: locate and read the merged City ward shapefile from the extracted
# ESRI folder (used by both branches below, avoiding duplication).
read_londonwards <- function() {
  fs::dir_info(here::here("data",
                          "statistical-gis-boundaries-london",
                          "ESRI")) %>%
    # \\. escapes the literal dot; $ anchors the match to the end of the path
    dplyr::filter(str_detect(path,
                             "London_Ward_CityMerged\\.shp$")) %>%
    dplyr::select(path) %>%
    dplyr::pull() %>%
    #read the shapefile in
    sf::st_read()
}
checkfile <- file_exists(here::here("data/statistical-gis-boundaries-london.zip"))
if (checkfile) {
  # Zip already downloaded - read straight from the extracted folder.
  # NOTE(review): this assumes the zip was also extracted on a previous run;
  # if only the zip is present, st_read() will fail.
  Londonwards <- read_londonwards()
  cat(cyan("\nAlready loaded!\n"))
} else {
  # Download the boundary zip directly from the London Data Store.
  # here::here() keeps the destination consistent with the existence check
  # above (the original relative path could disagree if the working directory
  # changed); mode = "wb" guards against zip corruption on Windows.
  download.file("https://data.london.gov.uk/download/statistical-gis-boundary-files-london/9ba8c833-6370-4b11-abdc-314aa020d5e0/statistical-gis-boundaries-london.zip",
                destfile = here::here("data", "statistical-gis-boundaries-london.zip"),
                mode = "wb")
  listfiles <- dir_info(here::here("data")) %>%
    # Match only files actually ending in .zip (dot escaped, match anchored;
    # the original ".zip" pattern would match "zip" anywhere in a path)
    dplyr::filter(str_detect(path, "\\.zip$")) %>%
    dplyr::select(path) %>%
    pull() %>%
    #print out the .zip file path
    print() %>%
    as.character() %>%
    utils::unzip(exdir = here::here("data"))
  Londonwards <- read_londonwards()
  cat(cyan("\nDownloaded from London Data Store\n"))
}
## Reading layer `London_Ward_CityMerged' from data source
## `C:\Users\joepo\Documents\Uni\UCL CASA\CASA0005\gis_wk8\data\statistical-gis-boundaries-london\ESRI\London_Ward_CityMerged.shp'
## using driver `ESRI Shapefile'
## Simple feature collection with 625 features and 7 fields
## Geometry type: POLYGON
## Dimension: XY
## Bounding box: xmin: 503568.2 ymin: 155850.8 xmax: 561957.5 ymax: 200933.9
## Projected CRS: OSGB36 / British National Grid
##
## Already loaded!
# Read in attribute data: the ward profiles atlas CSV.
# The path is piped into read_csv so the chain reads top-to-bottom.
LondonWardProfiles <-
  here::here("data", "ward-profiles-excel-version.csv") %>%
  read_csv(col_names = TRUE,
           # Treat these strings as missing values in the source file
           na = c("", "NA", "n/a"),
           locale = locale(encoding = "Latin1")) %>%
  # Standardise column names to snake_case
  clean_names()
## Rows: 660 Columns: 67
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Ward name, Old code, New code
## dbl (64): Population - 2015, Children aged 0-15 - 2015, Working-age (16-64) ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# spec(LondonWardProfiles)
# Read in schools point data and keep only secondary schools
ldn_schools <-
  here::here("data", "all_schools_xy_2016.csv") %>%
  read_csv(col_names = TRUE,
           locale = locale(encoding = "Latin1")) %>%
  # Convert to a spatial object using the x/y columns
  # (crs = 4326 assumes x/y are WGS84 lon/lat - confirm against source)
  st_as_sf(coords = c("x", "y"),
           crs = 4326) %>%
  # Filter to secondary schools
  filter(PHASE == "Secondary")
## Rows: 3889 Columns: 24
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (14): SCHOOL_NAM, TYPE, PHASE, ADDRESS, TOWN, POSTCODE, STATUS, GENDER, ...
## dbl (7): URN, EASTING, NORTHING, map_icon_l, Primary, x, y
## num (1): OBJECTID
## lgl (2): NEW_URN, OLD_URN
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Plot a map of the shapefile data:
# Quick thematic maps to eyeball both layers
qtm(Londonwards)
qtm(ldn_schools)
# Check CRS for each geofile
# NOTE(review): append() flattens the two crs objects into a single list, so
# elements are accessed below by magic index ([1] = wards $input,
# [3] = schools $input) - fragile if st_crs()'s structure ever changes
check_crs <- append(st_crs(Londonwards), st_crs(ldn_schools))
check_crs[1]
## $input
## [1] "OSGB36 / British National Grid"
# CRS input string for the schools layer (element 3 of the appended list)
check_crs[3]
## $input
## [1] "EPSG:4326"
View a sample of the attribute data:
# Preview the first six rows of the ward attribute data
head(LondonWardProfiles)
## # A tibble: 6 × 67
## ward_name old_c…¹ new_c…² popul…³ child…⁴ worki…⁵ older…⁶ perce…⁷ perce…⁸
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 City of London 00AA E09000… 8100 650 6250 1250 8 76.9
## 2 Barking and D… 00ABFX E05000… 14750 3850 10150 750 26 69
## 3 Barking and D… 00ABFY E05000… 10600 2700 6800 1050 25.7 64.3
## 4 Barking and D… 00ABFZ E05000… 12700 3200 8350 1100 25.4 65.9
## 5 Barking and D… 00ABGA E05000… 10400 2550 6400 1450 24.3 61.5
## 6 Barking and D… 00ABGB E05000… 10750 2150 7050 1550 20.1 65.7
## # … with 58 more variables: percent_all_older_people_aged_65_2015 <dbl>,
## # mean_age_2013 <dbl>, median_age_2013 <dbl>, area_square_kilometres <dbl>,
## # population_density_persons_per_sq_km_2013 <dbl>, percent_bame_2011 <dbl>,
## # percent_not_born_in_uk_2011 <dbl>,
## # percent_english_is_first_language_of_no_one_in_household_2011 <dbl>,
## # general_fertility_rate_2013 <dbl>, male_life_expectancy_2009_13 <dbl>,
## # female_life_expectancy_2009_13 <dbl>, …
Reproject spatial objects to the same CRS.
Merge the attribute data to the wards shapefile.
#Transform the CRS for school points to the British grid
ldn_schools <- st_transform(ldn_schools, crs = 27700)
# Join the atlas attributes onto the ward polygons: GSS_CODE in the shapefile
# matches new_code in the CSV. left_join keeps every ward polygon even where
# no attribute row matches.
wards_merged <- Londonwards %>%
left_join(LondonWardProfiles, by = c("GSS_CODE" = "new_code")) %>%
#Select desired columns
#Note that 'geometry' is sticky - i.e. does not need to be specified in select statement to be retained
select(NAME, GSS_CODE, BOROUGH,
population_2015, population_density_persons_per_sq_km_2013,
average_gcse_capped_point_scores_2014,
unauthorised_absence_in_all_schools_percent_2013,
median_house_price_2014)
Map the joined file to check the data
# Choropleth of GCSE scores - confirms the attribute join worked
qtm(wards_merged,
fill = "average_gcse_capped_point_scores_2014",
borders = NULL)
Start the investigation by fitting a linear regression model to the dependent and independent variables. The first step here is to check the distributions of the input variables.
# Histogram + density curve of GCSE scores, to check the variable is roughly
# normally distributed before fitting a linear model.
dist_gcse <- ggplot(wards_merged,
                    aes(x = average_gcse_capped_point_scores_2014)) +
  # after_stat(density) replaces the ..density.. notation deprecated in
  # ggplot2 3.4.0
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 5) +
  #Add density line
  geom_density(colour = "orange",
               linewidth = 1, adjust = 1)
dist_gcse
# Histogram + density curve of unauthorised absence, same normality check
dist_absence <- ggplot(wards_merged,
                       aes(x = unauthorised_absence_in_all_schools_percent_2013)) +
  # after_stat(density) replaces the ..density.. notation deprecated in
  # ggplot2 3.4.0
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 0.1) +
  #Add density line
  geom_density(colour = "lightblue",
               linewidth = 1, adjust = 1)
dist_absence
Both variables are roughly normally distributed.
#Create a regression model on its own
# Simple linear model: GCSE score explained by unauthorised absence.
# The data frame is piped in and referenced as `.` inside lm().
model1 <- wards_merged %>%
lm(average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013,
data = .)
summary(model1)
##
## Call:
## lm(formula = average_gcse_capped_point_scores_2014 ~ unauthorised_absence_in_all_schools_percent_2013,
## data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.753 -10.223 -1.063 8.547 61.842
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 371.471 2.165 171.6
## unauthorised_absence_in_all_schools_percent_2013 -41.237 1.927 -21.4
## Pr(>|t|)
## (Intercept) <2e-16 ***
## unauthorised_absence_in_all_schools_percent_2013 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 16.39 on 624 degrees of freedom
## Multiple R-squared: 0.4233, Adjusted R-squared: 0.4224
## F-statistic: 458 on 1 and 624 DF, p-value: < 2.2e-16
The scatterplot below shows GCSE scores by school absences.
# Create a base scatter plot of GCSE scores against unauthorised absence.
# ggplot() replaces qplot(), which was deprecated in ggplot2 3.4.0, and the
# points are drawn once with geom_jitter() (the original drew a plain point
# layer AND a jittered layer on top of each other).
q <- ggplot(wards_merged,
            aes(x = unauthorised_absence_in_all_schools_percent_2013,
                y = average_gcse_capped_point_scores_2014)) +
  #jitter argument used as the x-scale is rounded
  geom_jitter()
#Add a regression line (linewidth replaces the deprecated size aesthetic)
q + stat_smooth(method = "lm", se = FALSE, linewidth = 1)
## `geom_smooth()` using formula = 'y ~ x'
The assumptions of a linear regression model must then be validated.
These are:
Is there a linear association between the variables?
Are the errors independent?
Are the errors normally distributed?
Do the errors have equal variance?
To test these assumptions, the model first needs to be tidied using
‘broom’ and ‘tidypredict’ packages.
#Create a tibble storing the test statistics
# (one row per model term: estimate, std. error, t statistic, p-value)
broom::tidy(model1)
## # A tibble: 2 × 5
## term estim…¹ std.e…² stati…³ p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 371. 2.16 172. 0
## 2 unauthorised_absence_in_all_schools_percent_… -41.2 1.93 -21.4 1.27e-76
## # … with abbreviated variable names ¹estimate, ²std.error, ³statistic
#Creates a tibble of the extended regression results
# (R-squared, sigma, AIC/BIC, log-likelihood etc. in a single row)
glance(model1)
## # A tibble: 1 × 12
## r.squared adj.r.squa…¹ sigma stati…² p.value df logLik AIC BIC devia…³
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.423 0.422 16.4 458. 1.27e-76 1 -2638. 5282. 5296. 167699.
## # … with 2 more variables: df.residual <int>, nobs <int>, and abbreviated
## # variable names ¹adj.r.squared, ²statistic, ³deviance
#Use tidypredict to add a column of expected result given the model
# NOTE(review): df_model1 is reassigned by augment() further down, so this
# fitted column is superseded later in the script
df_model1 <- wards_merged %>% tidypredict_to_column(model1)
Plot a scatterplot to visualise the relationship. See the scatterplot above for confirmation of this assumption.
Transforming variables for regression
If variables are not normally distributed, they will often need to be
transformed in some way to make them more appropriate for linear
regression. A useful method for this is Tukey’s Ladder,
which tests different levels of power relationships to see which might
have the most normal outcomes.
# car::symbox() draws a boxplot of the variable raised to each power in
# Tukey's ladder, to identify which transformation looks most symmetric
symbox(~median_house_price_2014,
wards_merged,
na.rm= TRUE,
powers = seq(-3, 3, by=0.5)) #vector of powers to which 'x' is raised (here specified as a sequence)
The Tukey’s ladder suggests that a power relationship of x raised to the power of -1 will have the closest approximation to a normal distribution. To visualise this in a scatterplot:
# Scatter of GCSE score against inverse house price.
# ggplot() + geom_point() replaces qplot(), deprecated in ggplot2 3.4.0.
ggplot(wards_merged,
       aes(x = (median_house_price_2014)^-1,
           y = average_gcse_capped_point_scores_2014)) +
  geom_point()
Use the ‘augment’ function to store the fitted values and residuals calculated from the model, to plot a histogram of the residuals. From the plot below, we can be satisfied that assumption 2 has been met.
# Store the fitted values and residuals alongside the ward data
df_model1 <- augment(model1, wards_merged)
#Plot histogram of the residuals
# A single ggplot histogram: the original qplot() %>% + geom_histogram()
# combination computed stat_bin twice (hence the duplicated message in the
# output), and qplot() is deprecated since ggplot2 3.4.0. bins = 30 matches
# the previous default.
df_model1 %>%
  ggplot(aes(x = .resid)) +
  geom_histogram(bins = 30)
The best way to quickly assess whether assumption 3 has been met, is to produce a scatter plot of residuals versus fitted values.
#Quick plot
# qplot(x = .fitted,
# y = .resid,
# data=df_model1)
#create using ggplot (better practice)
# Residuals vs fitted values: a random horizontal band around zero suggests
# homoscedastic, independent errors
ggplot(data = df_model1,
aes(x = .fitted, y = .resid)) +
geom_point() +
geom_hline(yintercept = 0,
color = "blue") +
labs(title = "Scatter plot of GCSE score fitted values versus residuals")
How to interpret this? If residuals (errors) are independent, we
would expect to see:
The residuals form a roughly horizontal band around the 0 line. This suggests the variance of the errors are equal.
The residuals “bounce randomly” around the 0 line. This suggests that the errors are independent.
No one residual “stands out” from the basic random pattern of residuals. This suggests that there are no outliers.
Diagnostic plots for the regression model:
Plotting a linear regression model in R will quickly produce a suite of
plots that can help when assessing assumptions (including the residuals
vs fit plot shown above).
par(mfrow=c(2,2)) #plot in a 2 by 2 array
# Base-R diagnostics: residuals vs fitted, Q-Q, scale-location, leverage.
# NOTE(review): par() is not restored afterwards - fine in a knitted chunk,
# but the 2x2 layout would persist in an interactive session
plot(model1)
An additional method is to test the independence of errors (i.e. the absence of autocorrelation) using a Durbin-Watson test. A statistic between 1 and 3 indicates little cause for concern.
#run durbin-watson test
# car::durbinWatsonTest() tests for serial autocorrelation in the residuals
DW <- durbinWatsonTest(model1)
tidy(DW)
## # A tibble: 1 × 5
## statistic p.value autocorrelation method alternative
## <dbl> <dbl> <dbl> <chr> <chr>
## 1 1.58 0 0.209 Durbin-Watson Test two.sided
To advance the analysis, additional independent variables
can be added to the regression model. In this case, median house price
has been added to Model 1. Note that the transformed values for median
house price will be used (as tested in Tukey’s ladder above).
Additional note: unable to successfully run model raising to power
of -1, so log transformed values used instead
# Multiple regression: model 1 plus log-transformed median house price
# (log used instead of the ^-1 transform suggested by Tukey's ladder - see
# note above)
model2 <- lm(average_gcse_capped_point_scores_2014 ~
unauthorised_absence_in_all_schools_percent_2013 + log(median_house_price_2014),
data = wards_merged)
tidy(model2)
## # A tibble: 3 × 5
## term estim…¹ std.e…² stati…³ p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 202. 20.1 10.0 4.79e-22
## 2 unauthorised_absence_in_all_schools_percent_… -36.2 1.92 -18.9 3.71e-63
## 3 log(median_house_price_2014) 12.8 1.50 8.50 1.37e-16
## # … with abbreviated variable names ¹estimate, ²std.error, ³statistic
# Extended fit statistics for model 2 (compare R-squared/AIC with model 1)
glance(model2)
## # A tibble: 1 × 12
## r.squared adj.r.squa…¹ sigma stati…² p.value df logLik AIC BIC devia…³
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.483 0.482 15.5 291. 4.78e-90 2 -2604. 5215. 5233. 150261.
## # … with 2 more variables: df.residual <int>, nobs <int>, and abbreviated
## # variable names ¹adj.r.squared, ²statistic, ³deviance
# Attach model-2 fitted values and residuals to the ward data
df_model2 <- augment(model2, wards_merged)
To check for multicollinearity among variables, need to calculate the correlation coefficient.
# Pairwise Pearson correlations between the model variables, used to check
# for multicollinearity among the predictors
Correlation <- wards_merged %>%
st_drop_geometry() %>% #convert from spatial to regular df
select(average_gcse_capped_point_scores_2014, unauthorised_absence_in_all_schools_percent_2013, median_house_price_2014) %>%
mutate(ln_median_house_price_2014 = log(median_house_price_2014)) %>% #Create log-transformed column of median house price
# select(-median_house_price_2014) %>% #Remove the raw column from dataset
correlate()
## Correlation computed with
## • Method: 'pearson'
## • Missing treated using: 'pairwise.complete.obs'
# Print the correlation matrix
Correlation
## # A tibble: 4 × 5
## term avera…¹ unaut…² media…³ ln_me…⁴
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 average_gcse_capped_point_scores_2014 NA -0.651 0.348 0.434
## 2 unauthorised_absence_in_all_schools_percent_2… -0.651 NA -0.217 -0.308
## 3 median_house_price_2014 0.348 -0.217 NA 0.916
## 4 ln_median_house_price_2014 0.434 -0.308 0.916 NA
## # … with abbreviated variable names ¹average_gcse_capped_point_scores_2014,
## # ²unauthorised_absence_in_all_schools_percent_2013,
## # ³median_house_price_2014, ⁴ln_median_house_price_2014
#Visualise the correlation matrix
# rplot(Correlation %>%
# focus(-average_gcse_capped_point_scores_2014,
# -median_house_price_2014,
# mirror = TRUE)
# )
The VIF is another way to assess multicollinearity, and can be used as a method to reduce unnecessary predictors. A VIF above 10 is considered problematic.
# Variance inflation factors for model 2 (values above ~10 are problematic)
vif(model2)
## unauthorised_absence_in_all_schools_percent_2013
## 1.105044
## log(median_house_price_2014)
## 1.105044
par(mfrow=c(2,2)) #plot in a 2 by 2 array
plot(model2) # base-R diagnostic plots for the two-predictor model
This dataset has a spatial component, and therefore a traditional Durbin-Watson test is not particularly appropriate. Instead, need to test for spatial autocorrelation. The first step in this approach is to plot the residuals on a map.
#Convert df_model2 to spatial object
sf_model2 <- df_model2 %>%
st_as_sf(.)
# Switch tmap to interactive (view) mode
tmap_mode("view")
## tmap mode set to interactive viewing
#Create basemap of wards
# Layer order matters: the school points are drawn on top of the polygons
tm_shape(sf_model2) +
#Fill polygons with residual value
tm_polygons(".resid",
palette = "RdYlBu") +
#Add points from London schools data
tm_shape(ldn_schools) +
tm_dots(col = "TYPE")
## Variable(s) ".resid" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
The Moran’s I statistic is used to identify the presence of spatial autocorrelation in a dataset. To calculate Moran’s I, need to create a spatial weights matrix based off the neighbours of each polygon. Therefore, the first step is to calculate the centroid of each polygon to use in a nearest-neighbours calculation.
#Calculate centroids
# Ward centroids serve as point locations for the neighbour calculations
wards_coords <- wards_merged %>%
st_centroid() %>%
st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over
## geometries of x
#Create binary matrix of neighbours
# Queen contiguity: polygons sharing any boundary point count as neighbours
wards_nb <- wards_merged %>%
poly2nb(queen = TRUE)
# Plot the neighbour links over the centroids.
# NOTE(review): wards_coords is already an sfc, so st_geometry() here is
# redundant (but harmless)
plot(wards_nb,
st_geometry(wards_coords),
col = "blue")
Can also calculate neighbours using the k nearest neighbours method (based off distance from centroid)
# k-nearest-neighbours list (k = 4) built from the ward centroids
wards_nbknn <- wards_coords %>%
knearneigh(k=4) %>%
knn2nb
## Warning in knearneigh(., k = 4): knearneigh: identical points found
## Warning in knearneigh(., k = 4): knearneigh: kd_tree not available for identical
## points
# Plot the knn neighbour links over the centroids
plot(wards_nbknn,
st_geometry(wards_coords),
col="green")
Use the neighbours matrix to calculate spatial weights.
# Row-standardised spatial weights from each neighbour definition
swm_queens <- wards_nb %>% nb2listw(style = "W") #'W' style indicates a row standardised matrix
swm_knn <- wards_nbknn %>% nb2listw(style = "W")
Finally, use the spatial weights matrix to calculate the Moran’s I test statistic.
# Moran's I on the model-2 residuals using the queen-contiguity weights.
# pull(.resid) extracts the residual vector directly (equivalent to the
# select-then-pull idiom); the weights list is passed as moran.test()'s
# second argument via the pipe.
moransI_queens <- df_model2 %>%
  st_drop_geometry() %>%
  pull(.resid) %>%
  moran.test(swm_queens) %>%
  tidy()
moransI_queens
## # A tibble: 1 × 7
## estimate1 estimate2 estimate3 statistic p.value method alter…¹
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 0.282 -0.0016 0.000556 12.0 1.54e-33 Moran I test under r… greater
## # … with abbreviated variable name ¹alternative
# Moran's I on the model-2 residuals using the k-nearest-neighbour weights.
# Same pipeline as the queen-contiguity test, with the knn weights list.
moransI_knn <- df_model2 %>%
  st_drop_geometry() %>%
  pull(.resid) %>%
  moran.test(swm_knn) %>%
  tidy()
moransI_knn
## # A tibble: 1 × 7
## estimate1 estimate2 estimate3 statistic p.value method alter…¹
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
## 1 0.292 -0.0016 0.000718 10.9 3.78e-28 Moran I test under r… greater
## # … with abbreviated variable name ¹alternative
For both neighbour definitions, the Moran’s I statistic falls between 0.28 and 0.30. Since Moran’s I ranges from -1 to 1, this indicates weak positive spatial autocorrelation (clustering) in the model residuals.